Data Analysis

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as gp
import plotly.express as px
In [13]:
df = pd.read_csv("cleaned_dataset_2017_2020.csv")
In [14]:
# Total UNIQUE customers in each loyalty group.
# BUG FIX: value_counts() on the raw rows counts transactions, not customers
# (compare with the nunique aggregation in the next cell).
df.groupby('loyalty')['customer_id'].nunique().plot.pie();
plt.savefig("Loyalty types and total customers")
In [16]:
# Unique customers per loyalty group; pass the string 'nunique' —
# passing the callable pd.Series.nunique to .agg is deprecated in pandas.
df.groupby(['loyalty']).agg(total_customers=('customer_id', 'nunique'))
Out[16]:
total_customers
loyalty
First Time Buyer 48
Loyalist 1339
Promiscuous 1872
In [17]:
# Unique customers per (loyalty, age_band) combination ('nunique' string
# replaces the deprecated pd.Series.nunique callable).
df.groupby(['loyalty','age_band']).agg(total_customers=('customer_id', 'nunique')).plot(figsize=(12,5), title='Loyalty by age_band', kind='barh')
plt.ylabel('Loyalty & age_band')
plt.savefig('Loyalty by age_band')
In [18]:
# Monthly revenue over the years. Pass 'sum' as a string — passing the
# builtin sum callable to .agg is deprecated in pandas.
df.transaction_date = pd.to_datetime(df.transaction_date)
df['year'] = df.transaction_date.dt.year
df['month'] = df.transaction_date.dt.month
df.groupby(['year','month']).agg(total_revenue=('price', 'sum')).plot(figsize=(12,5), title='Revenue by year');
plt.savefig('Revenue by year')
In [47]:
# Revenue aggregated across years per calendar month (string 'sum' replaces
# the deprecated builtin-callable form of .agg).
df.groupby(['month']).agg(total_revenue=('price', 'sum')).plot(title="Minimum and Maximum revenue by month", figsize=(12,5), legend=False);
plt.savefig("Minimum and Maximum revenue by month")
In [19]:
# Number of distinct commodities sold by each department ('nunique' string
# replaces the deprecated pd.Series.nunique callable).
df.groupby(['department']).agg(total_products=('commodity', 'nunique')).sort_values('total_products', ascending=True).plot(figsize=(10,5), kind='barh', title='Number of Products sold by each department', legend=False)
plt.savefig('Number of Products sold by each department')
In [20]:
# Top 10 commodities by total revenue (string 'sum' replaces the deprecated
# builtin-callable form of .agg).
top_10 = df.groupby(['commodity']).agg(total_revenue=('price', 'sum')) \
    .sort_values('total_revenue', ascending=False).head(10)

gp.Figure(
    data=gp.Bar(x=top_10.index, y=top_10['total_revenue']),
    layout=gp.Layout(
        title='Top 10 commodities by Revenue', height=400, width=500,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [21]:
# Bottom 10 commodities by total revenue, drawn in red for contrast with
# the previous chart (string 'sum' replaces the deprecated builtin callable).
bottom_10 = df.groupby(['commodity']).agg(total_revenue=('price', 'sum')) \
    .sort_values('total_revenue', ascending=True).head(10)

gp.Figure(
    data=gp.Bar(x=bottom_10.index, y=bottom_10['total_revenue'], marker=dict(color="red")),
    layout=gp.Layout(
        title='Bottom 10 commodities by Revenue', height=400, width=500,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [22]:
# Top 5 departments by total revenue (string 'sum' replaces the deprecated
# builtin-callable form of .agg).
top_5d = df.groupby(['department']).agg(total_revenue=('price', 'sum')) \
    .sort_values('total_revenue', ascending=False).head(5)

gp.Figure(
    data=gp.Bar(x=top_5d.index, y=top_5d['total_revenue']),
    layout=gp.Layout(
        title='Top 5 Departments by Revenue', height=400, width=500,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [23]:
# Bottom 5 departments by total revenue, in red (string 'sum' replaces the
# deprecated builtin-callable form of .agg).
bottom_5d = df.groupby(['department']).agg(total_revenue=('price', 'sum')) \
    .sort_values('total_revenue', ascending=True).head(5)

gp.Figure(
    data=gp.Bar(x=bottom_5d.index, y=bottom_5d['total_revenue'], marker=dict(color="red")),
    layout=gp.Layout(
        title='Bottom 5 Departments by Revenue', height=400, width=500,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [24]:
# Distinct products carried per brand type ('nunique' string replaces the
# deprecated pd.Series.nunique callable).
df.groupby(['brand']).agg(total_products=('product_id', 'nunique')).plot(figsize=(5,5), title='Products by brand', kind='barh');

Analysis by Loyalty

In [26]:
# Distinct products bought by each loyalty group ('nunique' string replaces
# the deprecated pd.Series.nunique callable).
df.groupby(['loyalty']).agg(total_commodity_brand=('product_id', 'nunique')).plot(figsize=(10,5), kind='barh', title='Total number of Products bought by Loyalty', legend=False);
plt.savefig('Total number of Products bought by Loyalty')
In [27]:
# Per-basket revenue first, then basket-level statistics per loyalty group
# (string 'sum' replaces the deprecated builtin-callable form of .agg).
tmp_stats = df.groupby(['loyalty','basket_id']).agg(revenue=('price', 'sum')).reset_index()
tmp_stats.groupby('loyalty').agg(avg_basket_amount=('revenue','mean'), median_basket_amount=('revenue','median'), total_basket_amount=('revenue','sum'), number_of_baskets=('basket_id','count'))
Out[27]:
avg_basket_amount median_basket_amount total_basket_amount number_of_baskets
loyalty
First Time Buyer 66.560385 68.855 3461.14 52
Loyalist 70.840921 68.870 99956.54 1411
Promiscuous 70.224487 68.085 142274.81 2026
In [28]:
# Daily revenue per loyalty group, one dashed line per group.
# (A dead, unassigned groupby statement that preceded this loop was removed;
# its result was never used or displayed.)
data = []
for l in df.loyalty.unique():
    tmp = df[df.loyalty==l].groupby(['transaction_date']).agg(revenue=('price', 'sum')).reset_index()
    data.append(gp.Scatter(x=tmp.transaction_date, y=tmp.revenue, name=l, line=dict(dash='dash')))
gp.Figure(
    data=data,
    layout=gp.Layout(
        title='Revenue by year based on loyalty', width=700,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [29]:
# Same revenue-by-loyalty series as the previous chart, rendered with seaborn
# (string 'sum' replaces the deprecated builtin-callable form of .agg).
tmp = df.groupby(['loyalty','transaction_date']).agg(total_revenue=('price', 'sum')).reset_index()
plt.figure(figsize=(20,8))
sns.lineplot(data=tmp, x='transaction_date', y='total_revenue', hue='loyalty');
plt.xlabel('Date')
plt.ylabel('total_revenue')
plt.title('Revenue by year based on loyalty')
# thin out the x ticks to 5 evenly spaced dates
x_labels = pd.date_range(tmp.transaction_date.min(), tmp.transaction_date.max(), 5)
plt.xticks(x_labels);
plt.savefig('Revenue by year based on loyalty')
In [30]:
# Top 5 commodities by revenue within each loyalty group (string 'sum'
# replaces the deprecated builtin callable; two dead mid-cell .head(5)
# statements removed — only a cell's last expression is displayed).
tmp_1 = df.groupby(['loyalty', 'commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.loyalty==l] \
         .sort_values('total_revenue', ascending=False) \
     .head(5) for l in tmp_1.loyalty.unique()]).reset_index(drop=True)

# one bar trace per commodity, grouped along the loyalty axis
data = []
for d in top_5_1.commodity.unique():
    tmp1 = top_5_1[top_5_1.commodity==d].groupby(['loyalty']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.loyalty, y=tmp1.revenue, name=d))

gp.Figure(
    data=data,
    layout=gp.Layout(
        title='Top commodities per Loyalty Group',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [31]:
# For the 5 highest-revenue departments, take their top 5 commodities and
# count unique customers per (loyalty, department, commodity)
# (string aggregation names replace the deprecated callables).
topfive = df.groupby(['department']).agg(total_revenue=('price', 'sum')).sort_values('total_revenue', ascending=False).reset_index().head(5)
tmp = df[df.department.isin(topfive.department.unique())].groupby(['department','commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5c = pd.concat([tmp[tmp.department == dd].sort_values('total_revenue', ascending=False).head(5) for dd in tmp.department.unique()]).reset_index(drop=True)
top_loyalty = df.merge(top_5c[['department','commodity']], on=['department','commodity'], how='inner').groupby(['loyalty','department','commodity']).agg(unique_customers=('customer_id', 'nunique'))
top_loyalty
Out[31]:
unique_customers
loyalty department commodity
First Time Buyer Deli Cheese 9
Chicken/poultry 3
Deli meats 14
Prepared food 6
Salad 4
... ... ... ...
Promiscuous Produce Apples 463
Berries 369
Potatoes 472
Salad 550
Tropical fruit 893

75 rows × 1 columns

In [32]:
# Top 5 commodities per loyalty group again, as a seaborn line plot
# (string 'sum' replaces the deprecated builtin-callable form of .agg).
tmp_1 = df.groupby(['loyalty', 'commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.loyalty==l] \
         .sort_values('total_revenue', ascending=False) \
     .head(5) for l in tmp_1.loyalty.unique()]).reset_index(drop=True)
plt.figure(figsize=(20,8))
sns.lineplot(data=top_5_1, x='loyalty', y='total_revenue', hue='commodity');
plt.xlabel('loyalty')
plt.ylabel('total_revenue');
plt.title('Top commodities by loyalty')
plt.savefig('Top commodities by loyalty')
In [33]:
# Same data as the previous cell, as a plotly marker plot.
# BUG FIX: cycle palette colours with modulo — indexing colors[i] raises
# IndexError as soon as there are more commodities than palette entries.
fig = gp.Figure()
colors = px.colors.qualitative.Plotly
for i, cat in enumerate(top_5_1['commodity'].unique()):
    dfp = top_5_1[top_5_1['commodity']==cat]
    fig.add_trace(gp.Scatter(x=dfp['loyalty'], y=dfp['total_revenue'],
                             mode='markers',
                             marker_color=colors[i % len(colors)],
                             name=cat))
fig.show()

Analysis by Age band

In [34]:
# Distinct products bought by each age band ('nunique' string replaces the
# deprecated pd.Series.nunique callable).
df.groupby(['age_band']).agg(total_commodity_brand=('product_id', 'nunique')).plot(figsize=(10,5), kind='barh', title='Total number of Products bought by age_band', legend=False);
plt.savefig('Total number of Products bought by age_band')
In [35]:
# Basket-level revenue statistics per age band (string 'sum' replaces the
# deprecated builtin-callable form of .agg).
tmp_stats = df.groupby(['age_band','basket_id']).agg(revenue=('price', 'sum')).reset_index()
tmp_stats.groupby('age_band').agg(avg_basket_amount=('revenue','mean'), median_basket_amount=('revenue','median'), total_basket_amount=('revenue','sum'), number_of_baskets=('basket_id','count'))
Out[35]:
avg_basket_amount median_basket_amount total_basket_amount number_of_baskets
age_band
19-24 72.049567 69.960 126591.09 1757
25-34 68.540389 67.585 40575.91 592
35-44 68.865253 63.910 14943.76 217
45-54 68.990352 67.740 62712.23 909
55-64 56.148750 56.420 449.19 8
65 and above 52.538750 48.890 420.31 8
In [36]:
# Daily revenue per age band, one dotted line per band.
# (A dead, unassigned groupby statement that preceded the loop was removed;
# its result was never used or displayed.)
data = []
for a in df.age_band.unique():
    tmp = df[df.age_band==a].groupby(['transaction_date']).agg(revenue=('price', 'sum')).reset_index()
    data.append(gp.Scatter(x=tmp.transaction_date, y=tmp.revenue, name=a, line=dict(dash='dot')))
gp.Figure(
    data=data,
    layout=gp.Layout(
        title='Revenue by year based on age_band',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [37]:
# Top 5 departments by revenue within each age band (string 'sum' replaces
# the deprecated builtin-callable form of .agg).
tmp_1 = df.groupby(['age_band','department']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.age_band==a] \
         .sort_values('total_revenue', ascending=False) \
     .head(5) for a in tmp_1.age_band.unique()]).reset_index(drop=True)

data = []
for d in top_5_1.department.unique():
    tmp1 = top_5_1[top_5_1.department==d].groupby(['age_band']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.age_band, y=tmp1.revenue, name=d))

gp.Figure(
    data=data,
    layout=gp.Layout(
        title='Top Departments per Age Band',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [38]:
# Top 5 commodities by revenue within each age band, stacked horizontal bars
# (string 'sum' replaces the deprecated builtin-callable form of .agg).
tmp_1 = df.groupby(['age_band','commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.age_band==a] \
         .sort_values('total_revenue', ascending=False) \
     .head(5) for a in tmp_1.age_band.unique()]).reset_index(drop=True)

data = []
for c in top_5_1.commodity.unique():
    tmp1 = top_5_1[top_5_1.commodity==c].groupby(['age_band']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.revenue, y=tmp1.age_band, name=c, orientation='h'))

fig = gp.Figure(
    data=data,
    layout=gp.Layout(
        title='Top Commodities per Age Band',
        yaxis=dict(
            title='Revenue'
        )
    )
)
# last expression: stacked figure renders as the cell's rich output
fig.update_layout(barmode='stack')

Analysis by Household Type

In [39]:
# Distinct products bought by each household type ('nunique' string replaces
# the deprecated pd.Series.nunique callable).
df.groupby(['household_type']).agg(total_commodity_brand=('product_id', 'nunique')).plot(figsize=(10,5), kind='barh', title='Total number of Products bought by household_type', legend=False);
plt.savefig('Total number of Products bought by household_type')
In [40]:
# Basket-level revenue statistics per household type (string 'sum' replaces
# the deprecated builtin-callable form of .agg).
tmp_stats = df.groupby(['household_type','basket_id']).agg(revenue=('price', 'sum')).reset_index()
tmp_stats.groupby('household_type').agg(avg_basket_amount=('revenue','mean'), median_basket_amount=('revenue','median'), total_basket_amount=('revenue','sum'), number_of_baskets=('basket_id','count'))
Out[40]:
avg_basket_amount median_basket_amount total_basket_amount number_of_baskets
household_type
1 adult with kids 70.411635 67.825 59427.42 844
2 adults with kids 68.680837 67.050 71428.07 1040
2 adults with no kids 71.249542 69.675 54434.65 764
Single female 71.992895 70.195 21885.84 304
Single male 71.459202 68.690 38516.51 539
In [41]:
#to find revenue by household_type
df.transaction_date = pd.to_datetime(df.transaction_date)
df['year'] = df.transaction_date.dt.year
df['month'] = df.transaction_date.dt.month
df.groupby(['year','household_type'])['price'].sum().unstack().plot.area(figsize = (20,5), title = "Revenue by Household Type");
In [42]:
# Revenue table per household type and year (string 'sum' replaces the
# deprecated builtin-callable form of .agg); bare last expression displays it.
tmp = df.groupby(['household_type', 'year']).agg(revenue=('price', 'sum'))
tmp
Out[42]:
revenue
household_type year
1 adult with kids 2017 15703.79
2018 16882.13
2019 19892.70
2020 6948.80
2 adults with kids 2017 19163.94
2018 22743.27
2019 21233.31
2020 8287.55
2 adults with no kids 2017 17618.35
2018 15002.01
2019 15806.97
2020 6007.32
Single female 2017 7294.39
2018 5871.17
2019 6011.50
2020 2708.78
Single male 2017 11254.15
2018 12121.91
2019 9693.70
2020 5446.75
In [43]:
# Top 5 commodities by revenue within each household type (string 'sum'
# replaces the deprecated builtin-callable form of .agg).
tmp_1 = df.groupby(['household_type','commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.household_type==h] \
         .sort_values('total_revenue', ascending=False) \
     .head(5) for h in tmp_1.household_type.unique()]).reset_index(drop=True)

data = []
for c in top_5_1.commodity.unique():
    tmp1 = top_5_1[top_5_1.commodity==c].groupby(['household_type']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.household_type, y=tmp1.revenue, name=c))

gp.Figure(
    data=data,
    layout=gp.Layout(
        title='Top Commodities per household_type',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [44]:
# Bottom 5 commodities by revenue within each household type — identical to
# the previous cell but sorted ascending (string 'sum' replaces the
# deprecated builtin-callable form of .agg).
tmp_1 = df.groupby(['household_type','commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.household_type==h] \
         .sort_values('total_revenue', ascending=True) \
     .head(5) for h in tmp_1.household_type.unique()]).reset_index(drop=True)

data = []
for c in top_5_1.commodity.unique():
    tmp1 = top_5_1[top_5_1.commodity==c].groupby(['household_type']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.household_type, y=tmp1.revenue, name=c))

gp.Figure(
    data=data,
    layout=gp.Layout(
        title='Bottom Commodities per household_type',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer='iframe')
In [45]:
# Top 5 commodities per household type as a seaborn line plot (string 'sum'
# replaces the deprecated builtin-callable form of .agg).
tmp_1 = df.groupby(['household_type', 'commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.household_type==h] \
         .sort_values('total_revenue', ascending=False) \
     .head(5) for h in tmp_1.household_type.unique()]).reset_index(drop=True);
plt.figure(figsize=(20,8))
sns.lineplot(data=top_5_1, x='household_type', y='total_revenue', hue='commodity');
plt.xlabel('household_type')
plt.ylabel('total_revenue');
plt.title('Top commodities by household_type')
Out[45]:
Text(0.5, 1.0, 'Top commodities by household_type')
In [46]:
# Scatter of the top commodities, coloured by household type, with outlined
# markers for readability.
fig = px.scatter(
    top_5_1,
    x="commodity",
    y="total_revenue",
    color="household_type",
    title='Top commodities by household_type',
)
marker_style = dict(size=10, line=dict(width=2, color='DarkSlateGrey'))
fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
fig.show()

Revenue Analysis of Commodity and Department

In [48]:
# Frozen-food commodities: monthly revenue, one line per year (string 'sum'
# replaces the deprecated builtin-callable form of .agg).
frozen = df[df.commodity.str.lower().str.contains('frozen')].copy()
tmp = frozen.groupby(['year','month']).agg(total_revenue=('price', 'sum')).reset_index()
sns.lineplot(data=tmp, x='month', y='total_revenue', hue='year');
plt.xlabel('month')
plt.ylabel('total_revenue')
plt.title("Frozen food revenue by year")
plt.savefig("Frozen food revenue by year")
In [49]:
# Deli-department commodities: monthly revenue, one line per year.
# BUG FIX: the original lower-cased the commodity text and then matched a
# pattern containing capital letters ('Deli meats', 'Prepared food', ...),
# so most alternatives could never match. case=False does the intended
# case-insensitive match against the raw text.
deli = df[df.commodity.str.contains('Deli meats|Chicken|poultry|Prepared food|Snacks|Deli specialties|Cheese|Service beverage|Salad|Sushi|Sandwiches|Party trays|Deli supplies', case=False)].copy()
tmp = deli.groupby(['year','month']).agg(total_revenue=('price', 'sum')).reset_index()
sns.lineplot(data=tmp, x='month', y='total_revenue', hue='year');
plt.xlabel('month')
plt.ylabel('total_revenue')
plt.title("Deli revenue by year")
plt.savefig("Deli revenue by year")

Forecasting by ARIMA method

In [50]:
import pandas as pd
from pandas.plotting import autocorrelation_plot
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose #library for time series analysis
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
import statsmodels
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
In [51]:
df = pd.read_csv("cleaned_dataset_2017_2020.csv")
In [52]:
df.columns
Out[52]:
Index(['customer_id', 'product_id', 'basket_id', 'loyalty', 'household_type',
       'age_band', 'department', 'brand', 'commodity', 'store',
       'transaction_date', 'price'],
      dtype='object')
In [53]:
# Normalise each transaction date to the first day of its month.
# BUG FIX: pd.offsets.MonthBegin(-1) rolls a date that already IS the 1st
# back to the PREVIOUS month; to_period('M').to_timestamp() maps every day
# of a month (including the 1st) to that month's start.
df['transaction_date'] = df.transaction_date.str[:10]
df['t_date'] = pd.to_datetime(df.transaction_date)
df['t_date'] = df.t_date.dt.to_period('M').dt.to_timestamp()
df.head()
Out[53]:
customer_id product_id basket_id loyalty household_type age_band department brand commodity store transaction_date price t_date
0 15803 1131974 57266 Loyalist 1 adult with kids 19-24 Grocery private Baked bread/buns/rolls 374 2018-10-18 0.99 2018-10-01
1 15803 1051516 57266 Loyalist 1 adult with kids 19-24 Produce national Vegetables - all others 374 2018-10-18 0.70 2018-10-01
2 15803 967254 57266 Loyalist 1 adult with kids 19-24 Pharmaceutical national Cold and flu 374 2018-10-18 1.68 2018-10-01
3 15803 1134222 57266 Loyalist 1 adult with kids 19-24 Grocery private Paper housewares 374 2018-10-18 2.59 2018-10-01
4 15803 1003421 57266 Loyalist 1 adult with kids 19-24 Grocery national Soup 374 2018-10-18 0.60 2018-10-01
In [54]:
# Monthly Deli revenue time series (string 'sum' replaces the deprecated
# builtin-callable form of .agg).
ts = df[(df.department == 'Deli')].groupby(['t_date']).agg(total_revenue=('price', 'sum')).reset_index()
ts.head()
Out[54]:
t_date total_revenue
0 2017-01-01 183.19
1 2017-02-01 293.15
2 2017-03-01 354.16
3 2017-04-01 393.33
4 2017-05-01 244.46
In [55]:
training = ts.loc[ts.t_date < '2020-01-01'].set_index('t_date')
training.shape
Out[55]:
(36, 1)
In [56]:
training.plot();
In [57]:
ts_components = seasonal_decompose(training)
ts_components.plot();
In [58]:
# Rough stationarity check: compare mean and variance of the two halves.
split = round(len(training) / 2)
x1 = training[0:split]
x2 = training[split:]
mean1 = x1.mean()
mean2 = x2.mean()
# use .iloc[0] — positional Series[0] indexing is deprecated in pandas
print("Mean 1 & 2= ", mean1.iloc[0], mean2.iloc[0])
var1 = x1.var()
var2 = x2.var()
print("Variance 1 & 2= ", var1.iloc[0], var2.iloc[0])
Mean 1 & 2=  331.7844444444444 379.4644444444445
Variance 1 & 2=  7199.863873202616 11537.822755555557
In [59]:
test_adf = adfuller(training)
print('ADF test = ', test_adf[0])
print('p-value = ', test_adf[1])
ADF test =  -4.851369879408998
p-value =  4.3328290099670464e-05
In [60]:
# Given that the ADF statistic is negative and the p-value < 0.05, we can reject the null hypothesis and conclude that the time series is stationary. Now we can apply a forecasting method.
autocorrelation_plot(training);
In [61]:
test = ts.loc[ts.t_date >= '2020-01-01'].set_index('t_date')
In [62]:
whole = ts.set_index('t_date').squeeze().copy()
history = whole.take(range(36))
future = test.squeeze().copy()
future
Out[62]:
t_date
2020-01-01    287.27
2020-02-01    271.27
2020-03-01    395.47
2020-04-01    430.60
2020-05-01    527.45
Name: total_revenue, dtype: float64
In [63]:
# Walk-forward validation: refit an AR(5) each month on all data seen so
# far and forecast one step ahead; use .iloc for positional access —
# label-style [0]/[t] indexing on a datetime-indexed Series is deprecated.
for t in range(len(future)):
    model = ARIMA(pd.DataFrame(history).asfreq('MS'), order=(5,0,0))
    model_fit = model.fit()
    output = model_fit.get_forecast(steps=1)
    yhat = output.predicted_mean.iloc[0].round(2)
    stderr = output.se_mean.iloc[0].round(2)
    confint = output.conf_int().to_numpy().round(2)
    month = future.index[t]
    obs = future.iloc[t].round(2)
    print(month)
    print('prediction:', yhat, ', expected:', obs, ', stderr:', stderr, ', conf. int:', confint)
    # extend the training window with the month just observed
    history = whole.take(range(36 + t+1))
2020-01-01 00:00:00
prediction: 401.54 , expected: 287.27 , stderr: 92.91 , conf. int: [[219.44 583.64]]
2020-02-01 00:00:00
prediction: 315.18 , expected: 271.27 , stderr: 93.76 , conf. int: [[131.42 498.94]]
2020-03-01 00:00:00
prediction: 358.81 , expected: 395.47 , stderr: 92.51 , conf. int: [[177.49 540.13]]
2020-04-01 00:00:00
prediction: 361.86 , expected: 430.6 , stderr: 91.52 , conf. int: [[182.49 541.23]]
2020-05-01 00:00:00
prediction: 373.07 , expected: 527.45 , stderr: 91.17 , conf. int: [[194.39 551.76]]
In [64]:
# forecasting for the next 18 months (through Nov 2021)
model = ARIMA(pd.DataFrame(history).asfreq('MS'), order=(5,0,0), freq='MS')
model_fit = model.fit()
output = model_fit.get_forecast(steps=18)
output.predicted_mean
Out[64]:
2020-06-01    375.716399
2020-07-01    317.950893
2020-08-01    363.188093
2020-09-01    375.461597
2020-10-01    391.049044
2020-11-01    368.116726
2020-12-01    347.682487
2021-01-01    356.893258
2021-02-01    362.772099
2021-03-01    364.904601
2021-04-01    361.316481
2021-05-01    356.130650
2021-06-01    357.356308
2021-07-01    359.388657
2021-08-01    359.802343
2021-09-01    359.121298
2021-10-01    357.967713
2021-11-01    358.021157
Freq: MS, Name: predicted_mean, dtype: float64
In [65]:
output.predicted_mean.plot();
In [66]:
output.conf_int().round(2)
Out[66]:
lower total_revenue upper total_revenue
2020-06-01 194.07 557.36
2020-07-01 131.72 504.18
2020-08-01 175.41 550.97
2020-09-01 187.68 563.24
2020-10-01 203.27 578.83
2020-11-01 177.58 558.66
2020-12-01 156.43 538.93
2021-01-01 165.52 548.27
2021-02-01 171.38 554.16
2021-03-01 173.52 556.29
2021-04-01 169.84 552.79
2021-05-01 164.60 547.66
2021-06-01 165.82 548.89
2021-07-01 167.85 550.92
2021-08-01 168.27 551.34
2021-09-01 167.58 550.66
2021-10-01 166.43 549.51
2021-11-01 166.48 549.56
In [67]:
# Overlay observed history, hold-out actuals and the ARIMA forecast.
plt.figure(figsize=(12,5), dpi=75)
for series, label in [(history, 'history'), (future, 'future'), (output.predicted_mean, 'forecast')]:
    plt.plot(series, label=label)
plt.title('Forecast vs Actuals for Deli')
plt.legend(loc='upper left', fontsize=8)
plt.show()

ARIMA projection for Frozen products

In [69]:
# Monthly frozen-foods revenue time series (string 'sum' replaces the
# deprecated builtin-callable form of .agg).
frozen = df[df.commodity.str.lower().str.contains('frozen')].copy()
ts = frozen.groupby(['t_date']).agg(total_revenue=('price', 'sum')).reset_index()
ts.head()
Out[69]:
t_date total_revenue
0 2017-01-01 542.47
1 2017-02-01 481.42
2 2017-03-01 575.11
3 2017-04-01 394.69
4 2017-05-01 529.46
In [70]:
training = ts.loc[ts.t_date < '2020-01-01'].set_index('t_date')
training.shape
Out[70]:
(36, 1)
In [71]:
training.plot();
In [72]:
# NOTE(review): duplicate of the previous cell — one of the two can be removed.
training.plot();
In [73]:
# Rough stationarity check on the frozen-foods series (two-half comparison).
split = round(len(training)/2)
x1 = training[0:split]
x2 = training[split:]

mean1 = x1.mean()
mean2 = x2.mean()
# use .iloc[0] — positional Series[0] indexing is deprecated in pandas
print("Mean 1 & 2 = ", round(mean1.iloc[0]), round(mean2.iloc[0]))

var1 = x1.var()
var2 = x2.var()
print("Variance  1 & 2 = ", round(var1.iloc[0]), round(var2.iloc[0]))
Mean 1 & 2 =  431 411
Variance  1 & 2 =  13260 19881
In [74]:
test_adf = adfuller(training)
print('ADF test = ',test_adf[0])
print('p-value = ',test_adf[1])
ADF test =  -3.414554075110886
p-value =  0.010472617702400572
In [75]:
# Given that the ADF statistic is negative and the p-value < 0.05, we can reject the null hypothesis and conclude that the time series is stationary. Now we can apply a forecasting method.
autocorrelation_plot(training);
In [76]:
# Hold-out period: 2020 onwards. End the cell with the frame itself for
# rich HTML display instead of print()'s plain text.
test = ts.loc[ts.t_date >= '2020-01-01'].set_index('t_date')
test
            total_revenue
t_date                   
2020-01-01         358.70
2020-02-01         508.11
2020-03-01         602.20
2020-04-01         551.03
2020-05-01         523.47
In [77]:
whole = ts.set_index('t_date').squeeze().copy()
history = whole.take(range(36))
future = test.squeeze().copy()
future
Out[77]:
t_date
2020-01-01    358.70
2020-02-01    508.11
2020-03-01    602.20
2020-04-01    551.03
2020-05-01    523.47
Name: total_revenue, dtype: float64
In [78]:
# Walk-forward validation for frozen foods: refit an AR(3) each month and
# forecast one step; use .iloc for positional access — label-style [0]/[t]
# indexing on a datetime-indexed Series is deprecated.
for t in range(len(future)):
    model = ARIMA(pd.DataFrame(history).asfreq('MS'), order=(3,0,0))
    model_fit = model.fit()
    output = model_fit.get_forecast(steps=1)
    yhat = output.predicted_mean.iloc[0].round(2)
    stderr = output.se_mean.iloc[0].round(2)
    confint = output.conf_int().to_numpy().round(2)
    month = future.index[t]
    obs = future.iloc[t].round(2)
    print(month)
    print('prediction:', yhat, ', expected:', obs, ', stderr:', stderr, ', conf. int:', confint)
    # extend the training window with the month just observed
    history = whole.take(range(36 + t+1))
2020-01-01 00:00:00
prediction: 506.62 , expected: 358.7 , stderr: 117.14 , conf. int: [[277.04 736.2 ]]
2020-02-01 00:00:00
prediction: 431.37 , expected: 508.11 , stderr: 117.51 , conf. int: [[201.06 661.69]]
2020-03-01 00:00:00
prediction: 321.16 , expected: 602.2 , stderr: 116.45 , conf. int: [[ 92.92 549.41]]
2020-04-01 00:00:00
prediction: 429.39 , expected: 551.03 , stderr: 121.65 , conf. int: [[190.97 667.81]]
2020-05-01 00:00:00
prediction: 399.2 , expected: 523.47 , stderr: 121.35 , conf. int: [[161.35 637.04]]
In [79]:
model = ARIMA(pd.DataFrame(history).asfreq('MS'), order=(3,0,0), freq='MS')
model_fit = model.fit()
output = model_fit.get_forecast(steps=18)
output.predicted_mean.plot();
In [80]:
plt.figure(figsize=(12,5), dpi=75);
plt.plot(history, label='history');
plt.plot(future, label='future');
plt.plot(output.predicted_mean, label='forecast');
plt.title('Forecast vs Actuals for Frozen foods');
plt.legend(loc='upper left', fontsize=8);
plt.show();
In [81]:
output.conf_int().round(2)
Out[81]:
lower total_revenue upper total_revenue
2020-06-01 164.08 638.98
2020-07-01 161.14 638.34
2020-08-01 175.35 654.07
2020-09-01 192.16 677.51
2020-10-01 193.00 678.49
2020-11-01 188.94 674.57
2020-12-01 185.31 671.15
2021-01-01 185.18 671.02
2021-02-01 186.08 671.94
2021-03-01 186.71 672.58
2021-04-01 186.71 672.58
2021-05-01 186.52 672.38
2021-06-01 186.40 672.27
2021-07-01 186.41 672.27
2021-08-01 186.45 672.31
2021-09-01 186.47 672.33
2021-10-01 186.47 672.33
2021-11-01 186.46 672.32

Forecasting using moving average method

In [82]:
# Overall monthly revenue series (string 'sum' replaces the deprecated
# builtin-callable form of .agg).
revenue = df.groupby(['t_date']).agg(total_revenue=('price', 'sum')).reset_index()
revenue.head()
Out[82]:
t_date total_revenue
0 2017-01-01 5154.58
1 2017-02-01 5625.23
2 2017-03-01 6181.24
3 2017-04-01 5583.59
4 2017-05-01 5362.04
In [83]:
frozen = df[df.commodity.str.lower().str.contains('frozen')].copy()
frozen.head()
Out[83]:
customer_id product_id basket_id loyalty household_type age_band department brand commodity store transaction_date price t_date
5 15803 937791 57266 Loyalist 1 adult with kids 19-24 Grocery national Frozen pizza 374 2018-10-18 2.25 2018-10-01
15 15803 987311 57266 Loyalist 1 adult with kids 19-24 Seafood national Seafood-frozen 374 2018-10-18 2.50 2018-10-01
37 15811 1012594 57381 Promiscuous Single female 19-24 Nutrition national Frozen 374 2019-10-18 4.00 2019-10-01
80 15818 1121213 62607 Loyalist 2 adults with no kids 19-24 Grocery national Frozen meat 374 2017-12-18 3.58 2017-12-01
89 15820 1072843 52166 Loyalist 2 adults with kids 25-34 Grocery national Frozen bread/dough 374 2019-08-18 3.98 2019-08-01
In [84]:
# Monthly frozen-foods revenue indexed by month start (string 'sum' replaces
# the deprecated builtin-callable form of .agg).
frozen_ts = frozen.groupby(['t_date']).agg(total_revenue=('price', 'sum'))
frozen_ts.head(36)
Out[84]:
total_revenue
t_date
2017-01-01 542.47
2017-02-01 481.42
2017-03-01 575.11
2017-04-01 394.69
2017-05-01 529.46
2017-06-01 377.45
2017-07-01 573.54
2017-08-01 623.00
2017-09-01 250.17
2017-10-01 325.93
2017-11-01 288.66
2017-12-01 493.14
2018-01-01 431.70
2018-02-01 399.87
2018-03-01 512.09
2018-04-01 342.27
2018-05-01 262.64
2018-06-01 347.35
2018-07-01 587.81
2018-08-01 555.65
2018-09-01 327.74
2018-10-01 281.44
2018-11-01 367.35
2018-12-01 415.56
2019-01-01 266.69
2019-02-01 293.93
2019-03-01 479.26
2019-04-01 655.28
2019-05-01 453.76
2019-06-01 355.10
2019-07-01 306.40
2019-08-01 457.89
2019-09-01 336.06
2019-10-01 334.36
2019-11-01 213.02
2019-12-01 715.14
In [85]:
frozen_ts.plot();
In [86]:
# 6-month simple moving average of frozen-foods revenue; end the cell with
# the frame itself for rich display instead of print()'s plain text.
frozen_tail = frozen_ts.rolling(window=6)
moving_avg = frozen_tail.mean()
moving_avg.head(41)
            total_revenue
t_date                   
2017-01-01            NaN
2017-02-01            NaN
2017-03-01            NaN
2017-04-01            NaN
2017-05-01            NaN
2017-06-01     483.433333
2017-07-01     488.611667
2017-08-01     512.208333
2017-09-01     458.051667
2017-10-01     446.591667
2017-11-01     406.458333
2017-12-01     425.740000
2018-01-01     402.100000
2018-02-01     364.911667
2018-03-01     408.565000
2018-04-01     411.288333
2018-05-01     406.951667
2018-06-01     382.653333
2018-07-01     408.671667
2018-08-01     434.635000
2018-09-01     403.910000
2018-10-01     393.771667
2018-11-01     411.223333
2018-12-01     422.591667
2019-01-01     369.071667
2019-02-01     325.451667
2019-03-01     350.705000
2019-04-01     413.011667
2019-05-01     427.413333
2019-06-01     417.336667
2019-07-01     423.955000
2019-08-01     451.281667
2019-09-01     427.415000
2019-10-01     373.928333
2019-11-01     333.805000
2019-12-01     393.811667
2020-01-01     402.528333
2020-02-01     410.898333
2020-03-01     455.255000
2020-04-01     491.366667
2020-05-01     543.108333
In [87]:
# Actuals vs the 6-month moving average on a shared axis.
fig, ax = plt.subplots(1, 1)
frozen_ts.plot(ax=ax)
moving_avg.plot(ax=ax, color='red', title="Forecast for Frozen foods")
plt.legend(['current', 'forecast'])
plt.show()
In [88]:
deli = df[(df.department ==  'Deli')].copy()
deli.head()
Out[88]:
customer_id product_id basket_id loyalty household_type age_band department brand commodity store transaction_date price t_date
9 15803 845307 57266 Loyalist 1 adult with kids 19-24 Deli national Deli meats 374 2018-10-18 3.17 2018-10-01
56 15815 1052729 52250 Loyalist Single female 19-24 Deli national Chicken/poultry 374 2019-08-18 3.96 2019-08-01
57 15815 9524291 52250 Loyalist Single female 19-24 Deli private Prepared food 374 2019-08-18 6.99 2019-08-01
82 15818 933835 62607 Loyalist 2 adults with no kids 19-24 Deli national Deli meats 374 2017-12-18 3.11 2017-12-01
94 15820 1052729 52166 Loyalist 2 adults with kids 25-34 Deli national Chicken/poultry 374 2019-08-18 3.96 2019-08-01
In [89]:
# Monthly Deli revenue indexed by month start (string 'sum' replaces the
# deprecated builtin-callable form of .agg).
deli_ts = deli.groupby(['t_date']).agg(total_revenue=('price', 'sum'))
deli_ts.plot();
In [90]:
train_len = 36
train = deli_ts[0:train_len]
test = deli_ts[train_len:]
In [91]:
# Simple-moving-average forecast: rolling mean over the training period,
# then the last training-window average held flat over the test period.
y_hat_sma = deli_ts.copy()
ma_window = 6
y_hat_sma['sma_forecast'] = deli_ts['total_revenue'].rolling(ma_window).mean()
# BUG FIX: the original chained assignment y[col][slice] = ... triggers
# SettingWithCopyWarning and may silently write to a copy; .loc assigns
# in place, and .iloc replaces the deprecated positional [train_len-1].
y_hat_sma.loc[y_hat_sma.index[train_len:], 'sma_forecast'] = y_hat_sma['sma_forecast'].iloc[train_len - 1]
In [92]:
# Train/test actuals against the moving-average forecast.
plt.figure(figsize=(10,5))
plt.grid()
for frame, label in [(train, 'Train'), (test, 'Test')]:
    plt.plot(frame['total_revenue'], label=label)
plt.plot(y_hat_sma['sma_forecast'], label='Simple moving average forecast')
plt.legend(loc='best')
plt.title('Moving Average Method Forecast for Deli department')
plt.show()
In [ ]: